# Load the vocabulary from all_words.txt (one entry per line).
# Blank lines are dropped so the empty string can never count as a known
# word and let rows with an empty first column slip through the filter.
with open('./models/all_words.txt', 'r', encoding='utf-8') as f:
    all_words = {word for word in (line.strip() for line in f) if word}

# Scan the indexed file and keep every distinct (word, index) pair whose word
# is in the vocabulary.  Each row is comma-separated: the word is the first
# field and its index (stored here as `cluster`) is the second; any further
# fields are ignored.
filtered_words = set()  # membership test for already-seen pairs
ordered_pairs = []      # same pairs in first-seen order, for stable output
with open('word-split-with-line-v19.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(',')
        if len(parts) < 2:
            # Blank or malformed row (no comma): nothing to index, skip it
            # instead of crashing with IndexError on parts[1].
            continue
        pair = (parts[0], parts[1])
        if pair[0] in all_words and pair not in filtered_words:
            filtered_words.add(pair)
            ordered_pairs.append(pair)

# Write the surviving pairs as "word,index" lines.  Iterating the list rather
# than the set keeps the output order identical from run to run (set order of
# strings varies with hash randomization).
with open('./models/filtered_indexed_words.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{word},{cluster}\n" for word, cluster in ordered_pairs)

print(f"已筛选并保存 {len(filtered_words)} 个不重复的词汇及其索引")
